In [None]:
import pytz
import datetime
import marimo as mo

# Stamp the notebook header with the current wall-clock time in the
# Asia/Kolkata timezone.
india_timezone = pytz.timezone("Asia/Kolkata")
now = datetime.datetime.now(tz=india_timezone)

# Formatting through the f-string spec delegates to now.strftime(...).
curr = f"{now:%Y-%m-%d, %I:%M:%S %p %Z}"

# Title block: week number, submission note and the timestamp computed above.
mo.md(
    f"""
# Week - 9.2

**Submission Date:** `This assignment will not be graded and is only for practice.`

**Last Updated:** `{curr}`
"""
)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [None]:
# Dataset provenance note, rendered inside a callout box.
mo.md(
    """### About the dataset:

This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset.

**Dataset:** \"https://drive.google.com/file/d/10Dj5jGYoqc7T5AMV_jBXJzDQQscHbrQ9/view?usp=sharing\""""
).callout()

In [None]:
# Read the CSV directly from the Google Drive download URL into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="https://drive.google.com/uc?id=10Dj5jGYoqc7T5AMV_jBXJzDQQscHbrQ9&export=download"
)

In [None]:
# Pull out the label column first, then keep every other column as features.
y = df["Outcome"]
X = df.drop(columns=[y.name])

## Instructions for questions 1, 2 and 3

Load the diabetes dataset.
Separate features and label.

### Question 1

Which of the following feature pairs have a negative correlation?

In [None]:
# Pairwise correlation of the features.
# Each cell is annotated with its coefficient, and a diverging palette on a
# symmetric [-1, 1] scale is used so that negatively correlated pairs are
# distinguishable from positively correlated ones (the default sequential
# palette without annotations does not show the sign).
sns.heatmap(
    X.corr(),
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
)

### Question 3

Which of the following features has the highest standard deviation?

In [None]:
# Name of the feature whose standard deviation is largest.
# DataFrame.std() computes every column's deviation in one vectorized call and
# idxmax() returns the label of the maximum directly, replacing the manual
# per-column loop + positional argmax lookup.
X.std().idxmax()

### Question 4

Are there any missing values in the dataset?

In [None]:
# "Yes" as soon as at least one feature cell is NaN, otherwise "No".
"Yes" if X.isna().to_numpy().any() else "No"

## Instructions for questions 4 to 7

Split the dataset into training and test sets in an **80:20** proportion with

```
"random_state": 1
```

Create a pipeline with scaler as StandardScaler and classifier as MLPClassifier.

Classifier should have the following properties:

- Classifier should have three hidden layers with 10 neurons each.
- Set
    ```
    activation='relu', solver='sgd', alpha=1e-4, learning_rate_init=0.2, max_iter=500, random_state=1
    ```

In [None]:
# Hold out 20% of the rows for testing; the fixed seed pins the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)

In [None]:
# Two-stage estimator: standardise the features, then fit an MLP classifier
# with three hidden layers of ten neurons each, trained with SGD.
pipeline = Pipeline(
    steps=[
        ("scalar", StandardScaler()),
        (
            "model",
            MLPClassifier(
                hidden_layer_sizes=(10,) * 3,
                solver="sgd",
                activation="relu",
                learning_rate_init=0.2,
                alpha=1e-4,
                max_iter=500,
                random_state=1,
            ),
        ),
    ]
)

### Question 4

What will be the mean accuracy of the model on the training data?

In [None]:
# Fit on the training split, then report mean accuracy on that same split.
# fit() returns the pipeline itself, so the score call can be chained.
pipeline.fit(X_train, y_train).score(X_train, y_train)

### Question 5

What will be the mean accuracy of the model on the test data?

In [None]:
# Mean accuracy of the fitted pipeline on the held-out test split.
pipeline.score(X_test, y_test)

### Question 6

What is the output activation function used by the classifier?

In [None]:
# Output-layer activation chosen by the fitted classifier step.
pipeline["model"].out_activation_

### Question 7

How many weights (excluding intercepts) between input layer and first hidden layer were learnt by the classifier?

In [None]:
# Number of learnt weights (no intercepts) between the input layer and the
# first hidden layer. Read from the fitted weight matrix itself: coefs_[0] is
# the input -> first-hidden-layer matrix, so its element count is the answer.
# Unlike multiplying hyperparameters, this reflects what was actually trained
# and does not break if hidden_layer_sizes is given as a bare int.
pipeline.named_steps["model"].coefs_[0].size